Загрузка данных, обзывание
data <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data", header=TRUE, sep=",")
colnames(data) <- c("sex", "length", "diameter", "height",
"whole_weight", "shucked_weight",
"viscera_weight", "shell_weight", "rings")
data$sex <- factor(c("Female", "Infant", "Male")[data$sex])
summary(data)
## sex length diameter height
## Female:1307 Min. :0.075 Min. :0.0550 Min. :0.0000
## Infant:1342 1st Qu.:0.450 1st Qu.:0.3500 1st Qu.:0.1150
## Male :1527 Median :0.545 Median :0.4250 Median :0.1400
## Mean :0.524 Mean :0.4079 Mean :0.1395
## 3rd Qu.:0.615 3rd Qu.:0.4800 3rd Qu.:0.1650
## Max. :0.815 Max. :0.6500 Max. :1.1300
## whole_weight shucked_weight viscera_weight shell_weight
## Min. :0.0020 Min. :0.0010 Min. :0.00050 Min. :0.0015
## 1st Qu.:0.4415 1st Qu.:0.1860 1st Qu.:0.09337 1st Qu.:0.1300
## Median :0.7997 Median :0.3360 Median :0.17100 Median :0.2340
## Mean :0.8288 Mean :0.3594 Mean :0.18061 Mean :0.2389
## 3rd Qu.:1.1533 3rd Qu.:0.5020 3rd Qu.:0.25300 3rd Qu.:0.3290
## Max. :2.8255 Max. :1.4880 Max. :0.76000 Max. :1.0050
## rings
## Min. : 1.000
## 1st Qu.: 8.000
## Median : 9.000
## Mean : 9.932
## 3rd Qu.:11.000
## Max. :29.000
Возможно интересные колонки
hist(data$diameter, main = "Диаметр, мм")

hist(data$height, main = "Высота, мм")

hist(data$whole_weight, main = "Полный вес, гр")

hist(data$rings, main = "Количество колец на ракушке, шт.")

Линейные модели по изначальным данным
Избавление от выбросов
data.noout <- data[complete.cases(data),]
data.noout <- data.noout[data.noout$diameter > .1 & data.noout$diameter < .6,]
data.noout <- data.noout[data.noout$height < .2 & data.noout$height > 0.05,]
data.noout <- data.noout[data.noout$whole_weight < 1.8,]
data.noout <- data.noout[data.noout$rings >= 5 & data.noout$rings < 20,]
hist(data.noout$diameter, main = "Диаметр, мм")

hist(data.noout$height, main = "Высота, мм")

hist(data.noout$whole_weight, main = "Полный вес, гр")

hist(data$rings, main = "Количество колец на ракушке, шт.")

Линейные модели по очищеным данным
Делим пополам
odds <- seq(1, nrow(data.noout), by=2)
data.in <- data.noout[odds,]
data.out <- data.noout[-odds,]
Предсказание веса по диаметру
lm.d.half <- lm(whole_weight ~ diameter, data=data.in)
summary(lm.d.half)
##
## Call:
## lm(formula = whole_weight ~ diameter, data = data.in)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48611 -0.09479 -0.01969 0.06878 0.91792
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.01689 0.01518 -67.01 <2e-16 ***
## diameter 4.43565 0.03669 120.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1395 on 1879 degrees of freedom
## Multiple R-squared: 0.8861, Adjusted R-squared: 0.886
## F-statistic: 1.462e+04 on 1 and 1879 DF, p-value: < 2.2e-16
data.d.pred <- predict(lm.d.half)
cor(data.in$whole_weight, data.d.pred)
## [1] 0.9413183
plot(data.in$whole_weight, data.d.pred)

data.d.pred.out <- predict(lm.d.half, data.out)
cor(data.out$whole_weight, data.d.pred.out)
## [1] 0.9397162
plot(data.out$whole_weight, data.d.pred.out)

Предсказание веса по высоте
lm.h.half <- lm(whole_weight ~ height, data=data.in)
summary(lm.h.half)
##
## Call:
## lm(formula = whole_weight ~ height, data = data.in)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.83191 -0.11657 -0.00807 0.10455 0.91126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.77739 0.01947 -39.94 <2e-16 ***
## height 11.39055 0.13894 81.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1932 on 1879 degrees of freedom
## Multiple R-squared: 0.7815, Adjusted R-squared: 0.7814
## F-statistic: 6721 on 1 and 1879 DF, p-value: < 2.2e-16
data.h.pred <- predict(lm.h.half)
cor(data.in$whole_weight, data.h.pred)
## [1] 0.8840352
plot(data.in$whole_weight, data.h.pred)

data.h.pred.out <- predict(lm.h.half, data.out)
cor(data.out$whole_weight, data.h.pred.out)
## [1] 0.8907289
plot(data.out$whole_weight, data.h.pred.out)

Предсказание кол-ва колец по весу
lm.r.half <- lm(rings ~ whole_weight, data=data.in)
summary(lm.r.half)
##
## Call:
## lm(formula = rings ~ whole_weight, data = data.in)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7233 -1.6340 -0.6424 1.0293 9.6802
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.1402 0.1157 61.70 <2e-16 ***
## whole_weight 3.3466 0.1316 25.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.359 on 1879 degrees of freedom
## Multiple R-squared: 0.256, Adjusted R-squared: 0.2556
## F-statistic: 646.6 on 1 and 1879 DF, p-value: < 2.2e-16
data.r.pred <- predict(lm.r.half)
cor(data.in$rings, data.r.pred)
## [1] 0.5059697
plot(data.in$rings, data.r.pred)

data.r.pred.out <- predict(lm.r.half, data.out)
cor(data.out$rings, data.r.pred.out)
## [1] 0.5132176
plot(data.out$rings, data.r.pred.out)
